{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def remove_space(string, threshold = 0.3):\n",
    "    \"\"\"Return `string` with some of its spaces randomly deleted.\n",
    "\n",
    "    One `random.random()` draw is made per ' ' character, in order; the\n",
    "    space is dropped when the draw is >= `threshold` and kept otherwise.\n",
    "    With `threshold` in [0, 1] each space therefore survives with\n",
    "    probability `threshold`. Other characters are copied unchanged and\n",
    "    consume no random numbers.\n",
    "    \"\"\"\n",
    "    kept = []\n",
    "    for char in string:\n",
    "        if char != ' ':\n",
    "            kept.append(char)\n",
    "        elif not (random.random() >= threshold):\n",
    "            kept.append(char)\n",
    "    return ''.join(kept)\n",
    "\n",
    "def package(string, repeat = 2):\n",
    "    \"\"\"Build (input, target) pairs for one piece of text.\n",
    "\n",
    "    The list starts with two identity pairs (original casing, then\n",
    "    lower-cased). Each of the `repeat` rounds then appends two more pairs\n",
    "    whose input is a `remove_space` corruption of the target: first for the\n",
    "    original casing, then for the lower-cased text.\n",
    "\n",
    "    Returns a list of 2 + 2 * repeat tuples.\n",
    "    \"\"\"\n",
    "    lowered = string.lower()\n",
    "    pairs = [(string, string), (lowered, lowered)]\n",
    "\n",
    "    for _ in range(repeat):\n",
    "        # Order matters for the random stream: original casing first.\n",
    "        noisy_original = remove_space(string)\n",
    "        noisy_lowered = remove_space(lowered)\n",
    "        pairs.append((noisy_original, string))\n",
    "        pairs.append((noisy_lowered, lowered))\n",
    "\n",
    "    return pairs\n",
    "\n",
    "def generate_short(string, min_words = 2, max_words = 10):\n",
    "    \"\"\"Return a random run of consecutive words taken from `string`.\n",
    "\n",
    "    `string` is split on whitespace. The run length is drawn with\n",
    "    `random.randint` from min_words..min(word count, max_words), then the\n",
    "    end position of the run is drawn from length..word count. The chosen\n",
    "    words are joined with single spaces.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    string : str\n",
    "        Text to sample from.\n",
    "    min_words : int\n",
    "        Smallest run length (default 2, the previously hard-coded value).\n",
    "    max_words : int\n",
    "        Largest run length (default 10, the previously hard-coded value).\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If no run length is possible, e.g. `string` has fewer than\n",
    "        `min_words` words. This is the same exception type that\n",
    "        `random.randint` raised for such input before the explicit check.\n",
    "    \"\"\"\n",
    "    words = string.split()\n",
    "    longest = min(len(words), max_words)\n",
    "    if longest < min_words:\n",
    "        raise ValueError(\n",
    "            'need between %d and %d words, got %d'\n",
    "            % (min_words, max_words, len(words))\n",
    "        )\n",
    "    length = random.randint(min_words, longest)\n",
    "    # `end` is an exclusive slice bound, so it may equal len(words).\n",
    "    end = random.randint(length, len(words))\n",
    "    return ' '.join(words[end - length: end])\n",
    "\n",
    "from tqdm import tqdm\n",
    "\n",
    "def loop(strings):\n",
    "    \"\"\"Generate training pairs for every text in `strings`.\n",
    "\n",
    "    Each text is shortened with `generate_short` and expanded with\n",
    "    `package`; the resulting pairs are concatenated into one flat list.\n",
    "    Progress is reported through tqdm.\n",
    "\n",
    "    Texts that cannot be processed are skipped rather than aborting the\n",
    "    whole batch (best-effort). Only `Exception` subclasses are swallowed,\n",
    "    so KeyboardInterrupt and SystemExit still stop the run.\n",
    "    \"\"\"\n",
    "    results = []\n",
    "    for text in tqdm(strings):\n",
    "        try:\n",
    "            results.extend(package(generate_short(text)))\n",
    "        except Exception:\n",
    "            # generate_short raises ValueError for texts with fewer than\n",
    "            # two words; such entries are dropped on purpose.\n",
    "            continue\n",
    "    return results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plain-text corpora processed below, addressed by position:\n",
    "# files[0] = wiki dump, files[1] = news dump, files[2] = iium dump.\n",
    "files = ['../pure-text/filtered-dumping-wiki.txt',\n",
    "        '../pure-text/dumping-cleaned-news.txt',\n",
    "        '../pure-text/dumping-iium.txt']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2037249"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the first corpus (files[0]) line by line, skipping empty lines.\n",
    "with open(files[0]) as fopen:\n",
    "    data = [line for line in fopen.read().split('\\n') if line]\n",
    "\n",
    "# Discard lines shorter than two characters.\n",
    "data = [line for line in data if len(line) >= 2]\n",
    "\n",
    "len(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('Huntelaar (lahir 12 Ogos 1983)', 'Huntelaar (lahir 12 Ogos 1983)'),\n",
       " ('huntelaar (lahir 12 ogos 1983)', 'huntelaar (lahir 12 ogos 1983)'),\n",
       " ('Huntelaar(lahir 12 Ogos1983)', 'Huntelaar (lahir 12 Ogos 1983)'),\n",
       " ('huntelaar(lahir 12ogos1983)', 'huntelaar (lahir 12 ogos 1983)'),\n",
       " ('Huntelaar (lahir 12Ogos1983)', 'Huntelaar (lahir 12 Ogos 1983)'),\n",
       " ('huntelaar(lahir 12ogos1983)', 'huntelaar (lahir 12 ogos 1983)')]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check: pairs built from a random short span of the first line.\n",
    "package(generate_short(data[0]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep 500,000 lines chosen at random without replacement. No seed is set\n",
    "# in the visible cells, so the subset differs between runs; random.sample\n",
    "# raises ValueError if fewer than 500,000 lines are available.\n",
    "data = random.sample(data, 500000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 66%|██████▌   | 20541/31250 [00:00<00:00, 31523.82it/s]\n",
      "100%|██████████| 31250/31250 [00:00<00:00, 35826.81it/s]\n",
      "100%|██████████| 31250/31250 [00:00<00:00, 35136.66it/s]\n",
      "100%|██████████| 31250/31250 [00:00<00:00, 43035.06it/s]\n",
      "100%|██████████| 31250/31250 [00:00<00:00, 38535.77it/s]\n",
      "100%|██████████| 31250/31250 [00:00<00:00, 40750.20it/s]\n",
      "100%|██████████| 31250/31250 [00:00<00:00, 39468.95it/s]\n",
      "100%|██████████| 31250/31250 [00:00<00:00, 44283.36it/s]\n",
      "100%|██████████| 31250/31250 [00:00<00:00, 40709.61it/s]\n",
      "100%|██████████| 31250/31250 [00:00<00:00, 34949.87it/s]\n",
      " 93%|█████████▎| 28911/31250 [00:00<00:00, 41288.21it/s]\n",
      "100%|██████████| 31250/31250 [00:00<00:00, 33850.85it/s]\n",
      "100%|██████████| 31250/31250 [00:00<00:00, 41634.56it/s]\n",
      "100%|██████████| 31250/31250 [00:00<00:00, 36835.98it/s]\n",
      "100%|██████████| 31250/31250 [00:00<00:00, 35705.44it/s]\n",
      "100%|██████████| 31250/31250 [00:00<00:00, 32657.17it/s]\n"
     ]
    }
   ],
   "source": [
    "import cleaning\n",
    "\n",
    "# cleaning is a project-local module that is not shown in this notebook.\n",
    "# NOTE(review): presumably multiprocessing() splits data into chunks, runs\n",
    "# loop on each chunk in a worker process and concatenates the results -\n",
    "# confirm against cleaning.py.\n",
    "results1 = cleaning.multiprocessing(data, loop)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2984904"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of pairs generated from the wiki sample.\n",
    "len(results1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('Kebakaran peluru 0,67 baru direka dibina hanya',\n",
       "  'Kebakaran peluru 0,67 baru direka dibina hanya'),\n",
       " ('kebakaran peluru 0,67 baru direka dibina hanya',\n",
       "  'kebakaran peluru 0,67 baru direka dibina hanya'),\n",
       " ('Kebakaran peluru 0,67 baru direka dibina hanya',\n",
       "  'Kebakaran peluru 0,67 baru direka dibina hanya'),\n",
       " ('kebakaranpeluru0,67baru direkadibinahanya',\n",
       "  'kebakaran peluru 0,67 baru direka dibina hanya'),\n",
       " ('Kebakaran peluru0,67baru direkadibinahanya',\n",
       "  'Kebakaran peluru 0,67 baru direka dibina hanya'),\n",
       " ('kebakaranpeluru0,67 baru direka dibinahanya',\n",
       "  'kebakaran peluru 0,67 baru direka dibina hanya'),\n",
       " ('Manakala Amy, bekas isteri Aaron menjadi batu',\n",
       "  'Manakala Amy, bekas isteri Aaron menjadi batu'),\n",
       " ('manakala amy, bekas isteri aaron menjadi batu',\n",
       "  'manakala amy, bekas isteri aaron menjadi batu'),\n",
       " ('Manakala Amy, bekasisteri Aaronmenjadibatu',\n",
       "  'Manakala Amy, bekas isteri Aaron menjadi batu'),\n",
       " ('manakalaamy,bekasisteri aaron menjadibatu',\n",
       "  'manakala amy, bekas isteri aaron menjadi batu')]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first ten generated pairs.\n",
    "results1[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# Persist the wiki pairs in the working directory. json writes each tuple\n",
    "# as an array, so the pairs come back as lists when the file is reloaded.\n",
    "with open('segmentation-wiki-short.json', 'w') as fopen:\n",
    "    json.dump(results1, fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the second corpus (files[1]) line by line, skipping empty lines.\n",
    "with open(files[1]) as fopen:\n",
    "    data = [line for line in fopen.read().split('\\n') if line]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop the reference to the previous corpus's pairs (already written to\n",
    "# disk) so the memory can be reclaimed before the next run.\n",
    "del results1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep 300,000 lines chosen at random without replacement; random.sample\n",
    "# raises ValueError if fewer than 300,000 lines are available.\n",
    "data = random.sample(data, 300000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 18750/18750 [00:00<00:00, 35175.34it/s]\n",
      "100%|██████████| 18750/18750 [00:00<00:00, 31118.67it/s]\n",
      "100%|██████████| 18750/18750 [00:00<00:00, 31782.93it/s]\n",
      "\n",
      " 68%|██████▊   | 12800/18750 [00:00<00:00, 18722.14it/s]\n",
      "100%|██████████| 18750/18750 [00:00<00:00, 33682.58it/s]\n",
      " 79%|███████▉  | 14801/18750 [00:00<00:00, 19008.23it/s]\n",
      "100%|██████████| 18750/18750 [00:00<00:00, 31072.21it/s]\n",
      " 96%|█████████▌| 18023/18750 [00:00<00:00, 22673.88it/s]\n",
      "100%|██████████| 18750/18750 [00:00<00:00, 33051.51it/s]\n",
      "100%|██████████| 18750/18750 [00:00<00:00, 30055.02it/s]\n",
      "100%|██████████| 18750/18750 [00:00<00:00, 26523.65it/s]\n",
      "100%|██████████| 18750/18750 [00:00<00:00, 29880.77it/s]\n",
      "100%|██████████| 18750/18750 [00:00<00:00, 29883.97it/s]\n",
      "100%|██████████| 18750/18750 [00:00<00:00, 28797.18it/s]\n",
      "100%|██████████| 18750/18750 [00:00<00:00, 27573.71it/s]\n"
     ]
    }
   ],
   "source": [
    "# Generate pairs for the news sample; cleaning is a project-local module\n",
    "# imported in an earlier cell.\n",
    "results1 = cleaning.multiprocessing(data, loop)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1786146"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of pairs generated from the news sample.\n",
    "len(results1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('(MRL) di pejabat Menteri Besar', '(MRL) di pejabat Menteri Besar'),\n",
       " ('(mrl) di pejabat menteri besar', '(mrl) di pejabat menteri besar'),\n",
       " ('(MRL) dipejabatMenteri Besar', '(MRL) di pejabat Menteri Besar'),\n",
       " ('(mrl)dipejabatmenteribesar', '(mrl) di pejabat menteri besar'),\n",
       " ('(MRL)dipejabatMenteri Besar', '(MRL) di pejabat Menteri Besar'),\n",
       " ('(mrl) dipejabatmenteri besar', '(mrl) di pejabat menteri besar'),\n",
       " ('\"Islam yang memanusiakan manusia, cinta tanah air.',\n",
       "  '\"Islam yang memanusiakan manusia, cinta tanah air.'),\n",
       " ('\"islam yang memanusiakan manusia, cinta tanah air.',\n",
       "  '\"islam yang memanusiakan manusia, cinta tanah air.'),\n",
       " ('\"Islamyang memanusiakanmanusia,cintatanahair.',\n",
       "  '\"Islam yang memanusiakan manusia, cinta tanah air.'),\n",
       " ('\"islamyangmemanusiakanmanusia,cintatanah air.',\n",
       "  '\"islam yang memanusiakan manusia, cinta tanah air.')]"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first ten generated pairs.\n",
    "results1[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the news pairs in the working directory (json was imported in an\n",
    "# earlier cell); tuples are written as arrays.\n",
    "with open('segmentation-news-short.json', 'w') as fopen:\n",
    "    json.dump(results1, fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1121978"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the third corpus (files[2]) line by line, skipping empty lines.\n",
    "with open(files[2]) as fopen:\n",
    "    data = [line for line in fopen.read().split('\\n') if line]\n",
    "\n",
    "len(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep 400,000 lines chosen at random without replacement; random.sample\n",
    "# raises ValueError if fewer than 400,000 lines are available.\n",
    "data = random.sample(data, 400000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 25000/25000 [00:00<00:00, 36603.77it/s]\n",
      " 90%|█████████ | 22597/25000 [00:00<00:00, 3758.52it/s]]\n",
      " 88%|████████▊ | 22082/25000 [00:00<00:01, 2268.62it/s]]\n",
      "100%|██████████| 25000/25000 [00:00<00:00, 36600.24it/s]\n",
      "\n",
      "100%|██████████| 25000/25000 [00:00<00:00, 34357.51it/s]\n",
      "100%|██████████| 25000/25000 [00:00<00:00, 33662.95it/s]\n",
      "100%|██████████| 25000/25000 [00:00<00:00, 31947.49it/s]\n",
      "100%|██████████| 25000/25000 [00:00<00:00, 31017.07it/s]\n",
      "100%|██████████| 25000/25000 [00:00<00:00, 36501.14it/s]\n",
      "100%|██████████| 25000/25000 [00:00<00:00, 35269.11it/s]\n",
      "100%|██████████| 25000/25000 [00:00<00:00, 33123.27it/s]\n",
      "100%|██████████| 25000/25000 [00:00<00:00, 34732.84it/s]\n",
      "100%|██████████| 25000/25000 [00:00<00:00, 35693.66it/s]\n",
      "100%|██████████| 25000/25000 [00:00<00:00, 34374.67it/s]\n",
      "100%|██████████| 25000/25000 [00:00<00:00, 34179.73it/s]\n"
     ]
    }
   ],
   "source": [
    "# Generate pairs for the iium sample; cleaning is a project-local module\n",
    "# imported in an earlier cell.\n",
    "results1 = cleaning.multiprocessing(data, loop)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2398446"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of pairs generated from the iium sample.\n",
    "len(results1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('ni saya kongsi pengalaman saya cari kerja, kali ni',\n",
       "  'ni saya kongsi pengalaman saya cari kerja, kali ni'),\n",
       " ('ni saya kongsi pengalaman saya cari kerja, kali ni',\n",
       "  'ni saya kongsi pengalaman saya cari kerja, kali ni'),\n",
       " ('nisayakongsipengalamansayacarikerja, kali ni',\n",
       "  'ni saya kongsi pengalaman saya cari kerja, kali ni'),\n",
       " ('nisayakongsi pengalaman saya carikerja, kalini',\n",
       "  'ni saya kongsi pengalaman saya cari kerja, kali ni'),\n",
       " ('ni sayakongsipengalamansaya cari kerja,kali ni',\n",
       "  'ni saya kongsi pengalaman saya cari kerja, kali ni'),\n",
       " ('ni sayakongsipengalamansayacarikerja,kali ni',\n",
       "  'ni saya kongsi pengalaman saya cari kerja, kali ni'),\n",
       " ('Apa dia kisah.', 'Apa dia kisah.'),\n",
       " ('apa dia kisah.', 'apa dia kisah.'),\n",
       " ('Apadiakisah.', 'Apa dia kisah.'),\n",
       " ('apadiakisah.', 'apa dia kisah.')]"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first ten generated pairs.\n",
    "results1[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the iium pairs in the working directory (json was imported in an\n",
    "# earlier cell); tuples are written as arrays.\n",
    "with open('segmentation-iium-short.json', 'w') as fopen:\n",
    "    json.dump(results1, fopen)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
